Import libraries
import pandas as pd
import numpy as np
# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
Import dataset
# Load the shopping-trends CSV into `df` and preview its first rows.
import pandas as pd
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs
# on a machine where the file exists at exactly this location.
df=pd.read_csv(r'C:\Users\SACHIN\OneDrive\Documents\ML MODELS DATA\shopping_trends.csv')
df.head()
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
| 2 | 3 | 50 | Male | Jeans | Clothing | 73 | Massachusetts | S | Maroon | Spring | 3.1 | Yes | Cash | Free Shipping | Yes | Yes | 23 | Credit Card | Weekly |
| 3 | 4 | 21 | Male | Sandals | Footwear | 90 | Rhode Island | M | Maroon | Spring | 3.5 | Yes | PayPal | Next Day Air | Yes | Yes | 49 | PayPal | Weekly |
| 4 | 5 | 45 | Male | Blouse | Clothing | 49 | Oregon | M | Turquoise | Spring | 2.7 | Yes | Cash | Free Shipping | Yes | Yes | 31 | PayPal | Annually |
df.shape
(3900, 19)
df.info() # data information
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3900 entries, 0 to 3899 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Customer ID 3900 non-null int64 1 Age 3900 non-null int64 2 Gender 3900 non-null object 3 Item Purchased 3900 non-null object 4 Category 3900 non-null object 5 Purchase Amount (USD) 3900 non-null int64 6 Location 3900 non-null object 7 Size 3900 non-null object 8 Color 3900 non-null object 9 Season 3900 non-null object 10 Review Rating 3900 non-null float64 11 Subscription Status 3900 non-null object 12 Payment Method 3900 non-null object 13 Shipping Type 3900 non-null object 14 Discount Applied 3900 non-null object 15 Promo Code Used 3900 non-null object 16 Previous Purchases 3900 non-null int64 17 Preferred Payment Method 3900 non-null object 18 Frequency of Purchases 3900 non-null object dtypes: float64(1), int64(4), object(14) memory usage: 579.0+ KB
# Check for null values: number of missing entries in each column
df.isnull().sum()
Customer ID 0 Age 0 Gender 0 Item Purchased 0 Category 0 Purchase Amount (USD) 0 Location 0 Size 0 Color 0 Season 0 Review Rating 0 Subscription Status 0 Payment Method 0 Shipping Type 0 Discount Applied 0 Promo Code Used 0 Previous Purchases 0 Preferred Payment Method 0 Frequency of Purchases 0 dtype: int64
Gender¶
df['Gender'].value_counts()
Gender Male 2652 Female 1248 Name: count, dtype: int64
# Bar chart of the number of customers per gender, with each count printed
# above its bar.
x = df['Gender'].value_counts()
plt.figure(figsize=(8, 6))
# Use a named palette here: the `colors` list is only created in a later cell,
# so referencing it at this point fails when the file is run top to bottom.
sns.barplot(x=x.index, y=x.values, palette='Set2')
plt.xlabel("Gender")
plt.ylabel('Counts')
plt.xticks(rotation=45)
# No plt.legend(): the bars carry no labels, so the call only produced a
# "No artists with labels found" warning.
for index, value in enumerate(x.values):
    # Centre the count over its bar, as the other annotated charts do.
    plt.text(index, value + 2.3, str(value), color='black', ha="center", size='small')
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
Category¶
df['Category'].value_counts()
Category Clothing 1737 Accessories 1240 Footwear 599 Outerwear 324 Name: count, dtype: int64
# Bar chart: number of purchases in each product category.
x = df['Category'].value_counts()
plt.figure(figsize=(8, 5))
category_ax = sns.barplot(x=x.index, y=x.values)
category_ax.set_xlabel("Category")
category_ax.set_ylabel('COUNTS')
plt.xticks(rotation=45)
plt.show()
# Pie chart of the category shares; the first two wedges are pulled out.
x = df['Category'].value_counts()
labels = x.index
explode = [0.1, 0.1, 0, 0]
plt.pie(
    x,
    explode=explode,
    labels=labels,
    autopct='%.f%%',
    startangle=45,
    rotatelabels=True,
)
plt.title('category size distribution')
plt.show()
Location¶
df['Location'].value_counts()
Location Montana 96 California 95 Idaho 93 Illinois 92 Alabama 89 Minnesota 88 Nebraska 87 New York 87 Nevada 87 Maryland 86 Delaware 86 Vermont 85 Louisiana 84 North Dakota 83 Missouri 81 West Virginia 81 New Mexico 81 Mississippi 80 Indiana 79 Georgia 79 Kentucky 79 Arkansas 79 North Carolina 78 Connecticut 78 Virginia 77 Ohio 77 Tennessee 77 Texas 77 Maine 77 South Carolina 76 Colorado 75 Oklahoma 75 Wisconsin 75 Oregon 74 Pennsylvania 74 Washington 73 Michigan 73 Alaska 72 Massachusetts 72 Wyoming 71 Utah 71 New Hampshire 71 South Dakota 70 Iowa 69 Florida 68 New Jersey 67 Hawaii 65 Arizona 65 Kansas 63 Rhode Island 63 Name: count, dtype: int64
# Scatter plot: one point per location, height = number of customers.
x = df['Location'].value_counts()
plt.figure(figsize=(11, 6))
plt.scatter(x.index, x.values)
plt.title('Different locations')
plt.xlabel('locations')
plt.ylabel('counts')
plt.xticks(rotation=90)
plt.show()
# Same per-location counts as a bar chart.
x = df['Location'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(x.index, x.values)
plt.title('Different locations')
plt.xlabel('locations')
plt.ylabel('counts')
plt.xticks(rotation=90)
plt.show()
Size and Season¶
df.head(2)
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
x=df["Size"].value_counts()
print(x)
Size M 1755 L 1053 S 663 XL 429 Name: count, dtype: int64
y=df["Season"].value_counts()
print(y)
Season Spring 999 Fall 975 Winter 971 Summer 955 Name: count, dtype: int64
# Pair the size counts (`x`) with the season counts (`y`) row by row and
# scatter one against the other.
# NOTE(review): the pairing is purely positional (i-th most common size with
# the i-th most common season) and needs `x` and `y` to have the same length.
#
# The summary table gets its own name: assigning it to `df` would replace the
# full dataset, and later cells still read columns such as 'Item Purchased'
# from `df`.
size_season_counts = pd.DataFrame({
    'Size': x.index,
    'Count_X': x.values,
    'Season': y.index,
    'Count_Y': y.values
})
plt.figure(figsize=(6, 4))
sns.scatterplot(data=size_season_counts, x='Count_X', y='Count_Y', hue='Season', palette='viridis', s=170)
plt.xlabel('Size')
plt.ylabel('Season')
plt.title('Scatter Plot of Size vs Season')
plt.legend(title='Season')
plt.show()
Item Purchased¶
x=df['Item Purchased'].value_counts()
x
Item Purchased Blouse 171 Jewelry 171 Pants 171 Shirt 169 Dress 166 Sweater 164 Jacket 163 Belt 161 Sunglasses 161 Coat 161 Sandals 160 Socks 159 Skirt 158 Shorts 157 Scarf 157 Hat 154 Handbag 153 Hoodie 151 Shoes 150 T-shirt 147 Sneakers 145 Boots 144 Backpack 143 Gloves 140 Jeans 124 Name: count, dtype: int64
# Bar chart of purchases per item (counts held in `x`), annotated with the
# exact count above every bar.
plt.figure(figsize=(14, 8))
plt.bar(x.index, x.values)
plt.xlabel('item purchased')
plt.ylabel('counts')
plt.xticks(rotation=45)
for position, count in enumerate(x.values):
    plt.text(position, count + 2, str(count), color='green', ha="center", size='small')
plt.show()
Relation between item purchased and size¶
data=df.groupby('Item Purchased')['Size'].value_counts().unstack()
print(data)
Size L M S XL Item Purchased Backpack 35 76 18 14 Belt 39 66 37 19 Blouse 46 75 29 21 Boots 40 70 21 13 Coat 45 66 36 14 Dress 47 77 27 15 Gloves 35 66 22 17 Handbag 34 72 29 18 Hat 41 67 23 23 Hoodie 40 68 26 17 Jacket 48 82 20 13 Jeans 39 41 26 18 Jewelry 39 77 37 18 Pants 46 80 25 20 Sandals 39 75 29 17 Scarf 45 65 28 19 Shirt 41 86 20 22 Shoes 47 66 22 15 Shorts 46 67 27 17 Skirt 53 67 27 11 Sneakers 46 56 21 22 Socks 40 74 26 19 Sunglasses 39 73 36 13 Sweater 42 77 27 18 T-shirt 41 66 24 16
import matplotlib.pyplot as plt
import seaborn as sns

# Grouped bar chart: for every item, one bar per size (wide table `data`
# reshaped to long form first).
plt.figure(figsize=(16, 14))
data_melted = data.reset_index().melt(id_vars='Item Purchased', var_name='Size', value_name='Count')
# One colour per hue level (size). The palette used to be sized from the
# unrelated variable `x`, which yields far more colours than there are sizes.
colors = sns.color_palette("Set2", data_melted['Size'].nunique())
sns.barplot(x='Item Purchased', y='Count', hue='Size', data=data_melted, palette=colors)
plt.title('Distribution of Item Purchased by Size')
plt.xlabel('Item Purchased')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Size', bbox_to_anchor=(0, 1))
plt.show()
# Grouped bar chart of item counts per size, with the count written above
# every bar.
plt.figure(figsize=(16, 14))
data_melted = data.reset_index().melt(id_vars='Item Purchased', var_name='Size', value_name='Count')
# One colour per hue level (size) instead of sizing the palette from the
# unrelated variable `x`.
colors = sns.color_palette("Set2", data_melted['Size'].nunique())
ax = sns.barplot(x='Item Purchased', y='Count', hue='Size', data=data_melted, palette=colors)
plt.title('Distribution of Item Purchased by Size')
plt.xlabel('Item Purchased')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.legend(title='Size', bbox_to_anchor=(0, 1))
# Label the bar containers (same approach as the stacked-bar cells) rather
# than walking every patch on the axes, so only real bars get a label.
# NOTE(review): newer seaborn versions reportedly add extra zero-height
# patches for the legend, which the patch loop would label "0" -- confirm.
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3)
plt.show()
Relation between Size and Gender
data2=df.groupby(['Size','Gender']).size().unstack()
data2
| Gender | Female | Male |
|---|---|---|
| Size | ||
| L | 337 | 716 |
| M | 590 | 1165 |
| S | 187 | 476 |
| XL | 134 | 295 |
# Grouped bar chart: customers per gender, split by size. `data2` is the
# Size x Gender count table and `colors` the palette built in an earlier cell.
plt.figure(figsize=(16, 12))
data_melted = data2.reset_index().melt(
    id_vars='Size',
    var_name='Gender',
    value_name='Count',
)
sns.barplot(x='Gender', y='Count', hue='Size', data=data_melted, palette=colors)
plt.title('Gender Across Different Size')
plt.xlabel('Gender')
plt.ylabel('COUNTS')
plt.show()
# Grouped bar chart of customers per gender split by size, with the count
# written above every bar. `data2` is the Size x Gender count table.
plt.figure(figsize=(16, 12))
data_melted = data2.reset_index().melt(id_vars='Size', var_name='Gender', value_name='Count')
ax = sns.barplot(data=data_melted, x='Gender', y='Count', hue='Size', palette=colors)
plt.title('Gender Across Different Size')
plt.xlabel('Gender')
plt.ylabel('COUNTS')
# bar_label centres each value over its own bar; the previous patch loop
# right-aligned the text (so it sat left of the bar) and labelled every
# patch on the axes, not only the bars.
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3)
plt.show()
Category-wise subscription status
x=df.groupby(['Category','Subscription Status']).size().unstack()
x
| Subscription Status | No | Yes |
|---|---|---|
| Category | ||
| Accessories | 906 | 334 |
| Clothing | 1280 | 457 |
| Footwear | 428 | 171 |
| Outerwear | 233 | 91 |
# Grouped bar chart: purchases per subscription status, one bar per category.
# `x` is the Category x Subscription Status count table.
plt.figure(figsize=(16, 10))
data_melt = x.reset_index().melt(
    id_vars='Category',
    var_name='Subscription Status',
    value_name='Count',
)
sns.barplot(x='Subscription Status', y='Count', hue='Category', data=data_melt)
plt.title('Subscription Status across differnt Category')
plt.xlabel('Subscription Status')
plt.ylabel('count')
plt.xticks(rotation=30)
plt.show()
# Grouped bar chart of purchases per subscription status and category, with
# the count written above every bar. `x` is the Category x Subscription
# Status count table.
plt.figure(figsize=(16, 10))
data_melt = x.reset_index().melt(id_vars='Category', var_name='Subscription Status', value_name='Count')
ax = sns.barplot(data=data_melt, x='Subscription Status', y='Count', hue='Category')
plt.title('Subscription Status across differnt Category')
plt.xlabel('Subscription Status')
plt.ylabel('count')
plt.xticks(rotation=30)
# bar_label centres each value over its own bar; the previous patch loop
# right-aligned the text and labelled every patch on the axes.
for container in ax.containers:
    ax.bar_label(container, fmt='%.0f', padding=3)
plt.show()
import matplotlib.pyplot as plt

# Box plot of the per-category counts, one box per subscription status
# (the columns of `x`).
# DataFrame.plot creates its own figure, so no separate plt.figure() call is
# made -- that only produced an extra empty figure. `stacked` is dropped
# because it has no meaning for a box plot.
x.plot(kind='box', figsize=(16, 10), colormap='tab20')
plt.title('Subscription Status across different categories')
# The boxes along the x-axis are the subscription-status columns.
plt.xlabel('Subscription Status')
plt.ylabel('Count')
plt.xticks(rotation=30)
# No legend: a box plot has no labelled artists, so plt.legend() only emitted
# a "No artists with labels found" warning.
plt.show()
No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
<Figure size 1600x1000 with 0 Axes>
import matplotlib.pyplot as plt

# Stacked bar chart: one bar per category, stacked by subscription status.
# DataFrame.plot creates its own figure (sized via `figsize`), so the extra
# plt.figure() call -- which only left an empty "0 Axes" figure -- is gone.
ax = x.plot(kind='bar', stacked=True, figsize=(16, 10), colormap='tab20')
plt.title('Subscription Status across different categories')
plt.xlabel('Category')
plt.ylabel('Count')
# Horizontal tick labels (a rotation of 360 degrees is the same as 0).
plt.xticks(rotation=0)
plt.legend(title='Subscription Status')
plt.show()
<Figure size 1600x1000 with 0 Axes>
import matplotlib.pyplot as plt

# Stacked bar chart per category with each segment's count printed at its
# centre.
# DataFrame.plot creates its own figure (sized via `figsize`), so the extra
# plt.figure() call -- which only left an empty "0 Axes" figure -- is gone.
ax = x.plot(kind='bar', stacked=True, figsize=(16, 10), colormap='tab20')
plt.title('Subscription Status across different categories')
plt.xlabel('Category')
plt.ylabel('Count')
# Horizontal tick labels (a rotation of 360 degrees is the same as 0).
plt.xticks(rotation=0)
plt.legend(title='Subscription Status')
for container in ax.containers:
    ax.bar_label(container, label_type='center', fmt='%d')
plt.show()
<Figure size 1600x1000 with 0 Axes>
Payment Method with respect to Discount Applied¶
x=df.groupby(['Payment Method','Discount Applied']).size().unstack()
x
| Discount Applied | No | Yes |
|---|---|---|
| Payment Method | ||
| Bank Transfer | 361 | 271 |
| Cash | 358 | 290 |
| Credit Card | 404 | 292 |
| Debit Card | 363 | 270 |
| PayPal | 372 | 266 |
| Venmo | 365 | 288 |
# Grouped bar chart: purchases per payment method, split by whether a
# discount was applied. `x` is the Payment Method x Discount Applied table.
data_melt = x.reset_index().melt(
    id_vars='Payment Method',
    var_name='Discount Applied',
    value_name='Counts',
)
sns.barplot(x='Payment Method', y='Counts', hue='Discount Applied', data=data_melt)
plt.title('Payment Method effects Discount Applied')
plt.xlabel('Payment Method')
plt.ylabel('Counts')
plt.xticks(rotation=30)
plt.show()
# Stacked version of the same table, with each segment's count at its centre.
ax = x.plot(kind='bar', stacked=True, colormap='tab20')
for segment_group in ax.containers:
    ax.bar_label(segment_group, label_type='center', fmt='%d')
plt.show()
x=df['Payment Method'].value_counts()
print(x)
Payment Method Credit Card 696 Venmo 653 Cash 648 PayPal 638 Debit Card 633 Bank Transfer 632 Name: count, dtype: int64
y=df['Discount Applied'].value_counts()
print(y)
Discount Applied No 2223 Yes 1677 Name: count, dtype: int64
a. What is the distribution of customer ages?
age=df['Age'].value_counts().sort_values()
age
Age 44 51 67 54 20 62 33 63 60 65 61 65 22 66 70 67 39 68 24 68 48 68 34 68 18 69 26 69 21 69 53 70 38 70 23 71 30 71 66 71 47 71 45 72 40 72 65 72 51 72 35 72 64 73 55 73 52 73 36 74 56 74 63 75 59 75 68 75 29 76 46 76 37 77 31 79 28 79 43 79 42 80 19 81 58 81 32 82 62 83 27 83 54 83 50 83 49 84 25 85 41 86 57 87 69 88 Name: count, dtype: int64
df['Age'].describe()
count 3900.000000 mean 44.068462 std 15.207589 min 18.000000 25% 31.000000 50% 44.000000 75% 57.000000 max 70.000000 Name: Age, dtype: float64
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of customer ages with a kernel-density curve on top.
plt.figure(figsize=(10, 6))
age_ax = sns.histplot(df['Age'], kde=True, color='blue')
age_ax.set_title('Age Distribution of Customers', fontsize=16)
age_ax.set_xlabel('Age', fontsize=12)
age_ax.set_ylabel('Counts', fontsize=12)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
b. Is there a difference in the age distribution between genders?
df.groupby('Gender')['Age'].value_counts()
Gender Age
Female 45 33
52 32
57 32
62 32
31 30
..
Male 67 40
33 40
45 39
22 38
44 28
Name: count, Length: 106, dtype: int64
df.groupby('Gender')['Age'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Gender | ||||||||
| Female | 1248.0 | 44.007212 | 14.953843 | 18.0 | 31.0 | 44.0 | 57.0 | 70.0 |
| Male | 2652.0 | 44.097285 | 15.328257 | 18.0 | 31.0 | 44.0 | 57.0 | 70.0 |
# Side-by-side box plots of age for each gender.
plt.figure(figsize=(10, 6))
gender_age_ax = sns.boxplot(x='Gender', y='Age', data=df, palette='viridis')
gender_age_ax.set_title('Age Distribution by Gender', fontsize=16)
gender_age_ax.set_xlabel('Gender', fontsize=12)
gender_age_ax.set_ylabel('Age', fontsize=12)
plt.show()
2. Gender Distribution:¶
a.What is the proportion of males and females in the dataset?
# Percentage of customers per gender. Divide by the actual number of rows
# instead of a hard-coded 3900 so the result stays right if the data changes.
df['Gender'].value_counts() * 100 / len(df)
Gender Male 68.0 Female 32.0 Name: count, dtype: float64
3. Location Analysis:¶
a.Which locations have the most customers?
# Customers per location, most frequent first; `x` keeps ALL locations,
# only the ten largest are displayed here.
x=df['Location'].value_counts()
x.head(10)
Location Montana 96 California 95 Idaho 93 Illinois 92 Alabama 89 Minnesota 88 Nebraska 87 New York 87 Nevada 87 Maryland 86 Name: count, dtype: int64
# Bar chart of the ten locations with the most customers.
# `x` holds the counts for every location, so restrict it to the first ten
# here -- otherwise the chart shows all locations under a "Top 10" title.
top_locations = x.head(10)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_locations.index, y=top_locations.values, palette='viridis')
plt.title('Top 10 Locations with Most Customers', fontsize=16)
plt.xlabel('Location', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.xticks(rotation=45)
for index, value in enumerate(top_locations.values):
    plt.text(index, value + 2, str(value), color='green', ha="center", size='small')
plt.show()
# How often each distinct purchase amount occurs, as a two-column frame
# (amount, count).
amount_frequency = df['Purchase Amount (USD)'].value_counts()
x = amount_frequency.reset_index()
x
| Purchase Amount (USD) | count | |
|---|---|---|
| 0 | 94 | 62 |
| 1 | 32 | 62 |
| 2 | 36 | 62 |
| 3 | 51 | 61 |
| 4 | 90 | 60 |
| ... | ... | ... |
| 76 | 100 | 36 |
| 77 | 87 | 35 |
| 78 | 49 | 35 |
| 79 | 69 | 34 |
| 80 | 61 | 33 |
81 rows × 2 columns
df['Purchase Amount (USD)'].max()
100
# Scatter plot: purchase amount against the number of times it occurs
# (`x` is the amount/count frame built above).
plt.figure(figsize=(10, 6))
amount_ax = sns.scatterplot(x=x['Purchase Amount (USD)'], y=x['count'])
amount_ax.set_title('Purchase Amount (USD) Distribution', fontsize=16)
amount_ax.set_xlabel('Purchase Amount (USD)', fontsize=12)
amount_ax.set_ylabel('Counts', fontsize=12)
plt.show()
df.head()
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
| 2 | 3 | 50 | Male | Jeans | Clothing | 73 | Massachusetts | S | Maroon | Spring | 3.1 | Yes | Cash | Free Shipping | Yes | Yes | 23 | Credit Card | Weekly |
| 3 | 4 | 21 | Male | Sandals | Footwear | 90 | Rhode Island | M | Maroon | Spring | 3.5 | Yes | PayPal | Next Day Air | Yes | Yes | 49 | PayPal | Weekly |
| 4 | 5 | 45 | Male | Blouse | Clothing | 49 | Oregon | M | Turquoise | Spring | 2.7 | Yes | Cash | Free Shipping | Yes | Yes | 31 | PayPal | Annually |
# Interactive (Plotly) version of the amount-vs-count scatter; `x` is the
# amount/count frame produced by value_counts().reset_index().
import plotly.express as px
fig=px.scatter(x,x=x['Purchase Amount (USD)'],y=x['count'],
title=("purchase amount distribution"),height=600)
fig.show()
import plotly.express as px
fig=px.histogram(df,df['Purchase Amount (USD)'],
title=("purchase amount distribution"),height=600)
fig.show()
import plotly.express as px

# Interactive histogram of customer ages. The title now names the plotted
# column; it was a copy of the purchase-amount chart's title.
fig = px.histogram(df, df['Age'],
                   title=("age distribution"), height=600)
fig.show()
df['Age'].value_counts()
Age 69 88 57 87 41 86 25 85 49 84 50 83 54 83 27 83 62 83 32 82 19 81 58 81 42 80 43 79 28 79 31 79 37 77 46 76 29 76 68 75 59 75 63 75 56 74 36 74 55 73 52 73 64 73 35 72 51 72 65 72 40 72 45 72 47 71 66 71 30 71 23 71 38 70 53 70 18 69 21 69 26 69 34 68 48 68 24 68 39 68 70 67 22 66 61 65 60 65 33 63 20 62 67 54 44 51 Name: count, dtype: int64
b. Are there differences in purchase amounts between genders?
df.groupby('Gender')['Purchase Amount (USD)'].value_counts()
Gender Purchase Amount (USD)
Female 82 24
32 22
52 22
99 21
70 21
..
Male 70 23
65 23
61 23
52 22
87 22
Name: count, Length: 162, dtype: int64
df.groupby('Gender')['Purchase Amount (USD)'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Gender | ||||||||
| Female | 1248.0 | 60.249199 | 23.420556 | 20.0 | 40.0 | 60.0 | 81.0 | 100.0 |
| Male | 2652.0 | 59.536199 | 23.809976 | 20.0 | 38.0 | 60.0 | 80.0 | 100.0 |
# Box plots of purchase amount for each gender, with horizontal grid lines.
plt.figure(figsize=(10, 6))
spend_ax = sns.boxplot(data=df, x='Gender', y='Purchase Amount (USD)', palette='viridis')
spend_ax.set_title('Distribution of Purchase Amounts by Gender', fontsize=14)
spend_ax.set_xlabel('Gender', fontsize=12)
spend_ax.set_ylabel('Purchase Amount (USD)', fontsize=12)
plt.grid(axis='y')
plt.show()
5. Category Popularity:¶
a.Which product categories are purchased the most?
x=df['Category'].value_counts()
x
Category Clothing 1737 Accessories 1240 Footwear 599 Outerwear 324 Name: count, dtype: int64
# Bar chart of purchases per product category (`x` = category counts), with
# the count printed above each bar.
plt.figure(figsize=(10, 6))
sns.barplot(x=x.index, y=x.values, palette='viridis')
plt.title('Product Categories', fontsize=14)
# Categories are on the x-axis and counts on the y-axis; the two labels were
# swapped.
plt.xlabel('Category', fontsize=12)
plt.ylabel('Number of Purchases', fontsize=12)
for index, value in enumerate(x.values):
    plt.text(index, value + 10, str(value), ha='center', fontsize=11)
plt.show()
x=df['Category'].value_counts().reset_index()
x
| Category | count | |
|---|---|---|
| 0 | Clothing | 1737 |
| 1 | Accessories | 1240 |
| 2 | Footwear | 599 |
| 3 | Outerwear | 324 |
# Interactive bar chart of purchases per category, one colour per category;
# `x` is the Category/count frame from value_counts().reset_index().
import plotly.express as px
fig=px.bar(x,x['Category'],y=x['count'],
title=("Category Popularity"),color='Category',height=600)
fig.show()
6. Item Analysis:¶
a.What are the most purchased items in each category?
x = df.groupby(['Category', 'Item Purchased']).size().reset_index(name='Count')
x
| Category | Item Purchased | Count | |
|---|---|---|---|
| 0 | Accessories | Backpack | 143 |
| 1 | Accessories | Belt | 161 |
| 2 | Accessories | Gloves | 140 |
| 3 | Accessories | Handbag | 153 |
| 4 | Accessories | Hat | 154 |
| 5 | Accessories | Jewelry | 171 |
| 6 | Accessories | Scarf | 157 |
| 7 | Accessories | Sunglasses | 161 |
| 8 | Clothing | Blouse | 171 |
| 9 | Clothing | Dress | 166 |
| 10 | Clothing | Hoodie | 151 |
| 11 | Clothing | Jeans | 124 |
| 12 | Clothing | Pants | 171 |
| 13 | Clothing | Shirt | 169 |
| 14 | Clothing | Shorts | 157 |
| 15 | Clothing | Skirt | 158 |
| 16 | Clothing | Socks | 159 |
| 17 | Clothing | Sweater | 164 |
| 18 | Clothing | T-shirt | 147 |
| 19 | Footwear | Boots | 144 |
| 20 | Footwear | Sandals | 160 |
| 21 | Footwear | Shoes | 150 |
| 22 | Footwear | Sneakers | 145 |
| 23 | Outerwear | Coat | 161 |
| 24 | Outerwear | Jacket | 163 |
# For each category keep the row of `x` with the highest Count.
# NOTE(review): idxmax returns only the first maximal row, so ties are
# dropped -- in the table above Clothing has both Blouse and Pants at 171,
# but only Blouse is reported.
y = x.loc[x.groupby('Category')['Count'].idxmax()]
y
| Category | Item Purchased | Count | |
|---|---|---|---|
| 5 | Accessories | Jewelry | 171 |
| 8 | Clothing | Blouse | 171 |
| 20 | Footwear | Sandals | 160 |
| 24 | Outerwear | Jacket | 163 |
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Matplotlib bar chart of the top item's count in each category (`y` holds
# one row per category), annotated with the count.
plt.figure(figsize=(8, 5))
plt.bar(y['Category'], y['Count'], width=0.5, color='skyblue')
plt.title('Most Purchased Items by Category (Matplotlib)', fontsize=14)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=360)
plt.tight_layout()
for position, count in enumerate(y['Count']):
    plt.text(position, count + 2, str(count), ha='center', fontsize=10)
plt.show()
# Seaborn version of the same chart: top item's count per category (`y`),
# annotated with the count.
plt.figure(figsize=(8, 5))
sns.barplot(x=y['Category'], y=y['Count'], width=0.5, palette='viridis')
plt.title('Most Purchased Items by Category (Seaborn)', fontsize=14)
plt.xlabel('Category', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=360)
plt.tight_layout()
for position, count in enumerate(y['Count']):
    plt.text(position, count + 2, str(count), ha='center', fontsize=10)
plt.show()
fig = px.bar(
x=y['Category'],
y=y['Count'],
text=y['Count'],
labels={'x': 'Category', 'y': 'Count'},
title='Most Purchased Items by Category (Plotly)',
color=y['Category'], width=800, height=500)
fig.show()
fig = px.bar(
x=y['Category'],
y=y['Count'],
labels={'x': 'Category', 'y': 'Count'},
title='Most Purchased Items by Category (Plotly)',
color=y['Category'], width=800, height=500)
fig.show()
7. Size and Color Preferences:¶
a.What sizes and colors are the most popular among customers?
df.head(2)
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
size_popularity = df['Size'].value_counts()
size_popularity
Size M 1755 L 1053 S 663 XL 429 Name: count, dtype: int64
color_popularity = df['Color'].value_counts()
color_popularity
Color Olive 177 Yellow 174 Silver 173 Teal 172 Green 169 Black 167 Cyan 166 Violet 166 Gray 159 Maroon 158 Orange 154 Charcoal 153 Pink 153 Magenta 152 Blue 152 Purple 151 Peach 149 Red 148 Beige 147 Indigo 147 Lavender 147 Turquoise 145 White 142 Brown 141 Gold 138 Name: count, dtype: int64
# Pie chart of size shares; the first two wedges are pulled out.
# Note: `ax` here is the Series of size counts, not a matplotlib Axes.
ax = df['Size'].value_counts()
labels = ax.index
explode = [0.1, 0.1, 0, 0]
plt.pie(
    ax,
    explode=explode,
    labels=labels,
    autopct='%.f%%',
    startangle=45,
    rotatelabels=True,
)
plt.title('Popular Size')
plt.show()
# Bar chart of purchases per size (`size_popularity`), annotated with counts.
plt.figure(figsize=(12, 6))
size_ax = sns.barplot(
    x=size_popularity.index,
    y=size_popularity.values,
    palette='viridis',
    width=0.5,
)
size_ax.set_title('Popularity of Sizes', fontsize=14)
size_ax.set_xlabel('Size', fontsize=12)
size_ax.set_ylabel('Number of Purchases', fontsize=12)
plt.tight_layout()
for position, count in enumerate(size_popularity.values):
    plt.text(position, count + 0.5, str(count), color='black', ha='center', fontsize=10)
plt.show()
sns.barplot(x=color_popularity.index, y=color_popularity.values, palette='viridis')
plt.title('Popularity of Colors', fontsize=14)
plt.ylabel('Number of Purchases', fontsize=12)
plt.xlabel('Color', fontsize=12)
plt.xticks(rotation=90)
plt.tight_layout()
for index, value in enumerate(color_popularity.values):
plt.text(index, value + 0.5, str(value), color='black', ha='center', fontsize=6)
plt.show()
# Size popularity (left) and colour popularity (right) side by side.
_, (size_axis, color_axis) = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(x=size_popularity.index, y=size_popularity.values, palette='viridis', ax=size_axis)
size_axis.set_title('Popularity of Sizes', fontsize=14)
size_axis.set_xlabel('Size', fontsize=12)
size_axis.set_ylabel('Number of Purchases', fontsize=12)

sns.barplot(x=color_popularity.index, y=color_popularity.values, palette='viridis', ax=color_axis)
color_axis.set_title('Popularity of Colors', fontsize=14)
color_axis.set_ylabel('Number of Purchases', fontsize=12)
color_axis.set_xlabel('Color', fontsize=12)
# Only the colour panel gets vertical tick labels.
plt.setp(color_axis.get_xticklabels(), rotation=90)

plt.tight_layout()
plt.show()
b. Is there a correlation between age and size preferences?
x=df.groupby('Size')['Age'].mean()
x
Size L 44.571700 M 44.025641 S 43.865762 XL 43.321678 Name: Age, dtype: float64
# Bar chart of the mean customer age per size (`x`), annotated with the mean
# to two decimals.
plt.figure(figsize=(8, 6))
mean_age_ax = sns.barplot(x=x.index, y=x.values, palette="viridis", width=0.5)
mean_age_ax.set_title('Average Age by Size Preference', fontsize=14)
mean_age_ax.set_xlabel('Size', fontsize=12)
mean_age_ax.set_ylabel('Average Age', fontsize=12)
for position, mean_age in enumerate(x.values):
    plt.text(position, mean_age + 0.5, f"{mean_age:.2f}", color='black', ha='center', fontsize=10)
plt.tight_layout()
plt.show()
# Encode size as an ordered number (S=1 ... XL=4) in a new column so it can be
# correlated with age; sizes missing from the mapping become NaN.
df['Size_Numeric'] = df['Size'].map({'S': 1, 'M': 2, 'L': 3, 'XL': 4})
# Correlation coefficient between age and the encoded size (pandas' default
# method for Series.corr).
correlation = df['Age'].corr(df['Size_Numeric'])
correlation
0.0004505343087438301
plt.figure(figsize=(8, 6))
sns.regplot(x='Age', y='Size_Numeric', data=df)
plt.title('Relationship Between Age and Size Preferences', fontsize=14)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Size (Numeric)', fontsize=12)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()
sns.barplot(x='Age', y='Size_Numeric', data=df)
plt.title('Age vs. Size Preferences')
plt.xlabel('Age')
plt.ylabel('Size (Numeric)')
plt.xticks(rotation=90)
plt.show()
fig = px.box(df, x='Size', y='Age', title='Age Distribution by Size Preferences',
labels={'Age': 'Age', 'Size': 'Size'}, color='Size', template='plotly')
fig.show()
# Count purchases per (season, item), then keep the best-selling item of each
# season.
season_item = (
    df.groupby(['Season', 'Item Purchased'])
    .size()
    .reset_index(name='Count')
)
best_row_per_season = season_item.groupby('Season')['Count'].idxmax()
most_purchased_items = season_item.loc[best_row_per_season].reset_index(drop=True)
most_purchased_items
| Season | Item Purchased | Count | |
|---|---|---|---|
| 0 | Fall | Jacket | 54 |
| 1 | Spring | Sweater | 52 |
| 2 | Summer | Pants | 50 |
| 3 | Winter | Sunglasses | 52 |
# Bar chart of each season's best-selling item, coloured by item, with the
# count on top of every bar and the legend placed outside the axes.
plt.figure(figsize=(10, 6))
season_ax = sns.barplot(
    x='Season',
    y='Count',
    hue='Item Purchased',
    data=most_purchased_items,
    palette="viridis",
)
season_ax.set_title('Most Purchased Items in Each Season', fontsize=14)
season_ax.set_xlabel('Season', fontsize=12)
season_ax.set_ylabel('Purchase Count', fontsize=12)
plt.legend(title='Item Purchased', loc='upper left', bbox_to_anchor=(1, 1))
for bars in plt.gca().containers:
    plt.bar_label(bars, fmt='%.0f', label_type='edge', fontsize=10, color='black')
plt.show()
plt.figure(figsize=(8, 8))
explode=[0.1,0.1,0,0]
plt.pie(most_purchased_items['Count'], explode=explode, labels=most_purchased_items['Item Purchased'], autopct='%1.1f%%', startangle=140)
plt.title("Item Purchase Distribution", fontsize=14)
plt.show()
b. How does the purchase amount vary by season?
avg= df.groupby('Season')['Purchase Amount (USD)'].mean().reset_index()
avg
| Season | Purchase Amount (USD) | |
|---|---|---|
| 0 | Fall | 61.556923 |
| 1 | Spring | 58.737738 |
| 2 | Summer | 58.405236 |
| 3 | Winter | 60.357364 |
plt.figure(figsize=(8, 8))
explode=[0,0.1,0.2,0.1]
plt.pie(avg['Purchase Amount (USD)'], explode=explode, labels=avg['Season'], autopct='%1.1f%%', startangle=140)
plt.title('Proportion of Average Purchase Amount by Season', fontsize=14)
plt.show()
sns.lineplot(x='Season', y='Purchase Amount (USD)', data=avg, marker='o')
plt.title('Average Purchase Amount (USD) by Season', fontsize=14)
plt.ylabel('Average Purchase Amount (USD)', fontsize=12)
plt.xlabel('Season', fontsize=12)
plt.grid(alpha=0.5)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
9. Frequency of Purchases:¶
a.How often do customers make purchases (weekly, fortnightly, annually)?
df.head(2)
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | Size_Numeric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly | 3 |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly | 3 |
# Keep only rows whose purchase frequency is Weekly, Fortnightly or Annually,
# then count the rows per frequency.
# NOTE(review): rows with any other frequency value are dropped by this
# filter -- the three counts shown below add up to fewer rows than `df` has.
filter_df = df[df['Frequency of Purchases'].isin(['Weekly', 'Fortnightly', 'Annually'])]
counts = filter_df['Frequency of Purchases'].value_counts().reset_index()
counts
| Frequency of Purchases | count | |
|---|---|---|
| 0 | Annually | 572 |
| 1 | Fortnightly | 542 |
| 2 | Weekly | 539 |
# Bar chart of the number of customers per purchase frequency (`counts`),
# with the count on top of every bar.
plt.figure(figsize=(10, 6))
frequency_ax = sns.barplot(
    data=counts,
    x='Frequency of Purchases',
    y='count',
    palette='viridis',
    width=0.5,
)
frequency_ax.set_title('Frequency of Purchases by Customers', fontsize=14)
frequency_ax.set_ylabel('Number of Customers', fontsize=12)
frequency_ax.set_xlabel('Frequency of Purchases', fontsize=12)
for bars in plt.gca().containers:
    plt.bar_label(bars, fmt='%.0f', label_type='edge', fontsize=10, color='black')
plt.show()
sns.lineplot(x='Frequency of Purchases', y='count', data=counts, marker='o')
plt.title('Frequency of Purchases by Customers', fontsize=14)
plt.ylabel('Number of Customers', fontsize=12)
plt.xlabel('Frequency of Purchases', fontsize=12)
plt.grid(alpha=0.5)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
df.groupby('Subscription Status')['Purchase Amount (USD)'].mean()
Subscription Status No 59.865121 Yes 59.491928 Name: Purchase Amount (USD), dtype: float64
# Bar chart of the average purchase amount for subscribers vs non-subscribers
# (seaborn's barplot aggregates the rows of `df` per category).
plt.figure(figsize=(10, 6))
sns.barplot(x='Subscription Status', y='Purchase Amount (USD)', data=df, palette='viridis', width=0.5)
plt.title('Effect of Subscription Status on Purchase Amount', fontsize=14)
plt.xlabel('Subscription Status', fontsize=12)
plt.ylabel('Average Purchase Amount (USD)', fontsize=12)
plt.tight_layout()
for container in plt.gca().containers:
    # Two decimals: the group means differ by well under one dollar, so
    # whole-number labels would round away the difference being compared.
    plt.bar_label(container, fmt='%.2f', fontsize=10, color='black')
plt.show()
# Box plots of purchase amount for subscribers vs non-subscribers.
plt.figure(figsize=(10, 6))
sns.boxplot(x='Subscription Status', y='Purchase Amount (USD)', data=df, palette='viridis')
# Title and y-label describe the plotted column (Purchase Amount (USD)); they
# previously referred to purchase frequency, which this chart does not show.
plt.title('Purchase Amount Distribution by Subscription Status', fontsize=14)
plt.xlabel('Subscription Status', fontsize=12)
plt.ylabel('Purchase Amount (USD)', fontsize=12)
plt.tight_layout()
plt.show()
Are subscribed customers spending more than non-subscribed customers?
df.head(2)
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
11. Previous Purchases:¶
• Is there a correlation between the number of previous purchases and the review ratings?
corr = df['Previous Purchases'].corr(df['Review Rating'])
corr
0.004229099465270933
fig = px.scatter(df, x='Previous Purchases', y='Review Rating', title='Previous Purchases vs Review Rating', labels={'Previous Purchases': 'Previous Purchases', 'Review Rating': 'Review Rating'})
fig.show()
sns.regplot(x='Previous Purchases', y='Review Rating', data=df, scatter=False, color='red')
plt.title('Previous Purchases vs Review Rating', fontsize=16)
plt.xlabel('Previous Purchases', fontsize=12)
plt.ylabel('Review Rating', fontsize=12)
plt.show()
12. Review Ratings:¶
• What is the distribution of review ratings for products?
df['Review Rating'].value_counts()
Review Rating 3.4 182 4.0 181 4.6 174 4.2 171 2.9 170 4.9 166 3.9 163 3.0 162 2.6 159 4.4 158 3.1 157 3.7 156 3.5 156 2.7 154 3.3 152 3.2 152 3.6 149 4.7 148 4.1 148 4.3 147 4.8 144 3.8 142 4.5 139 2.8 136 5.0 68 2.5 66 Name: count, dtype: int64
df['Review Rating'].describe()
count 3900.000000 mean 3.749949 std 0.716223 min 2.500000 25% 3.100000 50% 3.700000 75% 4.400000 max 5.000000 Name: Review Rating, dtype: float64
# Histogram of review ratings with a kernel-density curve on top.
plt.figure(figsize=(10, 6))
rating_ax = sns.histplot(df['Review Rating'], kde=True, color='blue')
rating_ax.set_title('Review Rating Distribution for products', fontsize=16)
rating_ax.set_xlabel('Review Rating', fontsize=12)
rating_ax.set_ylabel('Counts', fontsize=12)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
plt.figure(figsize=(8, 6))
sns.boxplot(x=df['Review Rating'], color='lightgreen')
plt.title('Review Rating Distribution for Products', fontsize=16)
plt.xlabel('Review Rating', fontsize=12)
plt.show()
fig = px.box(df, y="Review Rating",
title="Review Rating Distribution for Products")
fig.show()
• How do review ratings vary across product categories?
dist = df.groupby('Category')['Review Rating'].describe()
dist
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Category | ||||||||
| Accessories | 1240.0 | 3.768629 | 0.715317 | 2.5 | 3.2 | 3.8 | 4.4 | 5.0 |
| Clothing | 1737.0 | 3.723143 | 0.717671 | 2.5 | 3.1 | 3.7 | 4.3 | 5.0 |
| Footwear | 599.0 | 3.790651 | 0.719843 | 2.5 | 3.2 | 3.8 | 4.4 | 5.0 |
| Outerwear | 324.0 | 3.746914 | 0.702598 | 2.5 | 3.1 | 3.8 | 4.3 | 5.0 |
import matplotlib.pyplot as plt
import seaborn as sns
# One box per product category showing how its review ratings are spread.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Category', y='Review Rating', palette='Set2', ax=ax)
ax.set_title('Distribution of Review Ratings Across Product Categories', fontsize=16)
ax.set_xlabel('Product Category', fontsize=12)
ax.set_ylabel('Review Rating', fontsize=12)
plt.tight_layout()
plt.show()
import plotly.express as px
# Interactive Plotly version of the per-category rating box plot.
axis_labels = {'Category': 'Product Category', 'Review Rating': 'Review Rating'}
fig = px.box(
    df,
    x='Category',
    y='Review Rating',
    color='Category',
    title='Distribution of Review Ratings Across Product Categories',
    labels=axis_labels,
    width=800,
    height=500,
)
fig.show()
# Line plot over the raw rows: seaborn aggregates the ratings per category
# itself before drawing the line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=df,
    x='Category',
    y='Review Rating',
    marker='o',
    color='blue',
    linewidth=2.5,
    ax=ax,
)
ax.set_title('Average Review Ratings Across Product Categories', fontsize=16)
ax.set_xlabel('Product Category', fontsize=12)
ax.set_ylabel('Average Review Rating', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
# Mean review rating per category, as a flat two-column DataFrame.
category_means = df.groupby('Category')['Review Rating'].mean().reset_index()
category_means
| Category | Review Rating | |
|---|---|---|
| 0 | Accessories | 3.768629 |
| 1 | Clothing | 3.723143 |
| 2 | Footwear | 3.790651 |
| 3 | Outerwear | 3.746914 |
# Line plot of the pre-computed per-category mean ratings.
fig, ax = plt.subplots(figsize=(7, 4))
sns.lineplot(
    data=category_means,
    x='Category',
    y='Review Rating',
    marker='o',
    color='blue',
    linewidth=2.5,
    ax=ax,
)
ax.set_title('Average Review Ratings Across Product Categories', fontsize=16)
ax.set_xlabel('Product Category', fontsize=12)
ax.set_ylabel('Average Review Rating', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.grid(alpha=0.5)
plt.tight_layout()
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
# Number of customers per preferred payment method, most common first.
payment_method_counts = df['Preferred Payment Method'].value_counts()
payment_method_counts
Preferred Payment Method PayPal 677 Credit Card 671 Cash 670 Debit Card 636 Venmo 634 Bank Transfer 612 Name: count, dtype: int64
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Bar chart of how many customers prefer each payment method, with the
# count written above every bar.
plt.figure(figsize=(10, 6))
sns.barplot(x=payment_method_counts.index, y=payment_method_counts.values, palette='viridis', width=0.5)
plt.title('Preferred Payment Methods', fontsize=16)
plt.xlabel('Payment Method', fontsize=14)
plt.ylabel('Count', fontsize=14)
# The original used rotation=3600 (ten full turns), a typo for unrotated labels.
plt.xticks(rotation=0)
for i, value in enumerate(payment_method_counts.values):
    plt.text(i, value + 5, str(value), ha='center', fontsize=10)
# Run tight_layout only after the labels exist so they are part of the layout.
plt.tight_layout()
plt.show()
# Pie chart of each payment method's share of customers.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    payment_method_counts.values,
    labels=payment_method_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Preferred Payment Methods', fontsize=16)
plt.tight_layout()
plt.show()
# Same counts drawn as a line, annotated with the value at each point.
ax = sns.lineplot(
    x=payment_method_counts.index,
    y=payment_method_counts.values,
    marker='o',
    linewidth=2,
    color='b',
)
ax.set_title('Preferred Payment Methods (Line Plot)', fontsize=16)
ax.set_xlabel('Payment Method', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
for position, total in enumerate(payment_method_counts.values):
    ax.text(position, total + 5, str(total), ha='center', fontsize=10)
plt.tight_layout()
ax.grid(alpha=0.5)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
• Does the preferred payment method differ by age or gender?
# Count customers for every (Gender, Age) pair, with one column per preferred
# payment method; combinations that never occur come out as NaN.
preferred_payment_by_age_gender = df.groupby(['Gender', 'Age', 'Preferred Payment Method']).size().unstack()
preferred_payment_by_age_gender
| Preferred Payment Method | Bank Transfer | Cash | Credit Card | Debit Card | PayPal | Venmo | |
|---|---|---|---|---|---|---|---|
| Gender | Age | ||||||
| Female | 18 | 3.0 | 6.0 | 5.0 | NaN | 3.0 | 3.0 |
| 19 | 5.0 | 4.0 | 4.0 | 3.0 | 7.0 | 4.0 | |
| 20 | 2.0 | 2.0 | 4.0 | NaN | 2.0 | 2.0 | |
| 21 | 3.0 | 3.0 | 2.0 | 4.0 | 4.0 | 3.0 | |
| 22 | 6.0 | 7.0 | 2.0 | 3.0 | 9.0 | 1.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... |
| Male | 66 | 12.0 | 7.0 | 9.0 | 7.0 | 9.0 | 6.0 |
| 67 | 5.0 | 4.0 | 10.0 | 8.0 | 9.0 | 4.0 | |
| 68 | 10.0 | 13.0 | 5.0 | 8.0 | 6.0 | 11.0 | |
| 69 | 13.0 | 11.0 | 9.0 | 17.0 | 5.0 | 8.0 | |
| 70 | 9.0 | 4.0 | 9.0 | 7.0 | 8.0 | 8.0 |
106 rows × 6 columns
# Heatmap of the (Gender, Age) x payment-method count table.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(preferred_payment_by_age_gender, linewidths=0.5, ax=ax)
ax.set_title('Preferred Payment Methods by Age Group and Gender', fontsize=16)
ax.set_xlabel('Preferred Payment Method', fontsize=14)
ax.set_ylabel('Gender and Age Group', fontsize=14)
plt.tight_layout()
plt.show()
# Reshape the wide count table to long form: one row per
# (Gender, Age, Preferred Payment Method) with its Count.
melted_data = preferred_payment_by_age_gender.reset_index().melt(id_vars=['Gender', 'Age'], var_name='Preferred Payment Method', value_name='Count')
melted_data
| Gender | Age | Preferred Payment Method | Count | |
|---|---|---|---|---|
| 0 | Female | 18 | Bank Transfer | 3.0 |
| 1 | Female | 19 | Bank Transfer | 5.0 |
| 2 | Female | 20 | Bank Transfer | 2.0 |
| 3 | Female | 21 | Bank Transfer | 3.0 |
| 4 | Female | 22 | Bank Transfer | 6.0 |
| ... | ... | ... | ... | ... |
| 631 | Male | 66 | Venmo | 6.0 |
| 632 | Male | 67 | Venmo | 4.0 |
| 633 | Male | 68 | Venmo | 11.0 |
| 634 | Male | 69 | Venmo | 8.0 |
| 635 | Male | 70 | Venmo | 8.0 |
636 rows × 4 columns
# Total number of customers per preferred payment method, split by gender.
# melted_data has one row per (Gender, Age, method), so the bars must SUM the
# per-age counts. seaborn's default estimator is the mean, which would plot the
# average count per age with error bars while the axis is labelled "Count".
plt.figure(figsize=(14, 8))
ax = sns.barplot(
    data=melted_data,
    x='Preferred Payment Method',
    y='Count',
    hue='Gender',
    palette='viridis',
    estimator='sum',
    errorbar=None,
)
# Age is aggregated away in this chart, so the title no longer mentions it.
plt.title('Preferred Payment Methods by Gender', fontsize=16)
plt.xlabel('Preferred Payment Method', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45)
plt.legend(title='Gender', fontsize=12)
# bar_label positions the text relative to each bar, independent of the y scale.
for container in ax.containers:
    ax.bar_label(container, fmt='%d', fontsize=12, padding=3)
plt.tight_layout()
plt.show()
14. Discounts and Promo Codes:¶
• How often are discounts and promo codes applied?
# Quick look at the first two rows to recall the column names.
df.head(2)
| Customer ID | Age | Gender | Item Purchased | Category | Purchase Amount (USD) | Location | Size | Color | Season | Review Rating | Subscription Status | Payment Method | Shipping Type | Discount Applied | Promo Code Used | Previous Purchases | Preferred Payment Method | Frequency of Purchases | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 55 | Male | Blouse | Clothing | 53 | Kentucky | L | Gray | Winter | 3.1 | Yes | Credit Card | Express | Yes | Yes | 14 | Venmo | Fortnightly |
| 1 | 2 | 19 | Male | Sweater | Clothing | 64 | Maine | L | Maroon | Winter | 3.1 | Yes | Bank Transfer | Express | Yes | Yes | 2 | Cash | Fortnightly |
# Absolute counts and percentage share of promo-code usage.
promo_code = df['Promo Code Used'].value_counts()
promo_code_percentage = df['Promo Code Used'].value_counts(normalize=True) * 100
for summary in (promo_code, promo_code_percentage):
    print(summary)
Promo Code Used No 2223 Yes 1677 Name: count, dtype: int64 Promo Code Used No 57.0 Yes 43.0 Name: proportion, dtype: float64
# Absolute counts and percentage share of purchases with a discount applied.
discount_count = df['Discount Applied'].value_counts()
discount_percentage = df['Discount Applied'].value_counts(normalize=True) * 100
for summary in (discount_count, discount_percentage):
    print(summary)
Discount Applied No 2223 Yes 1677 Name: count, dtype: int64 Discount Applied No 57.0 Yes 43.0 Name: proportion, dtype: float64
• Does using a discount or promo code affect the purchase amount?
# Mean purchase amount with vs. without a discount, and with vs. without a
# promo code; both Series are displayed together as a tuple.
discount_effect = df.groupby('Discount Applied')['Purchase Amount (USD)'].mean()
promo_code_effect = df.groupby('Promo Code Used')['Purchase Amount (USD)'].mean()
discount_effect, promo_code_effect
(Discount Applied No 60.130454 Yes 59.279070 Name: Purchase Amount (USD), dtype: float64, Promo Code Used No 60.130454 Yes 59.279070 Name: Purchase Amount (USD), dtype: float64)
# Side-by-side bar charts of the mean purchase amount by discount usage (left)
# and by promo-code usage (right).
panels = [
    (discount_effect, 'Discount Applied'),
    (promo_code_effect, 'Promo Code Used'),
]
plt.figure(figsize=(12, 6))
for position, (effect, label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=effect.index, y=effect.values, palette='viridis', width=0.5)
    plt.title(f'Average Purchase Amount by {label}', fontsize=14)
    plt.xlabel(label, fontsize=12)
    plt.ylabel('Average Purchase Amount (USD)', fontsize=12)
    for index, value in enumerate(effect.values):
        # Two decimals: str(value) printed the full float (e.g. 60.13045...),
        # which overflowed the bars.
        plt.text(index, value + 0.5, f'{value:.2f}', color='black', ha='center', fontsize=12)
# Both panels keep horizontal tick labels; the original rotated only the
# second panel's ticks by 90 degrees.
plt.tight_layout()
plt.show()
15. Shipping Types:¶
• What are the most common shipping types chosen by customers?
# Number of purchases per shipping type, most common first.
x=df['Shipping Type'].value_counts()
x
Shipping Type Free Shipping 675 Standard 654 Store Pickup 650 Next Day Air 648 Express 646 2-Day Shipping 627 Name: count, dtype: int64
# Horizontal bar chart of shipping-type counts, with the count written at the
# end of each bar.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=x.values, y=x.index, palette='viridis', ax=ax)
ax.set_title('Most Common Shipping Types Chosen by Customers', fontsize=16)
ax.set_xlabel('Number of Selections', fontsize=12)
ax.set_ylabel('Shipping Type', fontsize=12)
for row, selections in enumerate(x.values):
    ax.text(selections + 5, row, str(selections), va='center', fontsize=10)
plt.tight_layout()
plt.show()
# Pie chart of each shipping type's share of purchases.
plt.figure(figsize=(8, 8))
pie_ax = x.plot.pie(autopct='%1.1f%%', startangle=140)
pie_ax.set_title('Distribution of Shipping Types Chosen by Customers', fontsize=14)
# pandas labels the y-axis with the Series name; blank it out.
pie_ax.set_ylabel('')
plt.tight_layout()
plt.show()
# Line chart of shipping-type counts, annotated with the value at each point.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x.index, x.values, marker='o', linestyle='-', color='purple')
ax.set_title('Frequency of Shipping Types Chosen by Customers', fontsize=16)
ax.set_xlabel('Shipping Type', fontsize=12)
ax.set_ylabel('Number of Selections', fontsize=12)
plt.xticks(rotation=45, fontsize=10)
ax.grid(visible=True, linestyle='--', alpha=0.7)
for position, selections in enumerate(x.values):
    ax.text(position, selections + 1.5, str(selections), ha='center', fontsize=10, color='black')
plt.tight_layout()
plt.show()
• Does the purchase amount vary based on the shipping type?
# Mean purchase amount (USD) for each shipping type.
a = df.groupby('Shipping Type')['Purchase Amount (USD)'].mean()
a
Shipping Type 2-Day Shipping 60.733652 Express 60.475232 Free Shipping 60.410370 Next Day Air 58.631173 Standard 58.460245 Store Pickup 59.893846 Name: Purchase Amount (USD), dtype: float64
# Horizontal bars of the mean purchase amount per shipping type, each labelled
# with its dollar value.
plt.figure(figsize=(10, 6))
bar_ax = a.plot.barh(color='skyblue', edgecolor='black')
bar_ax.set_title('Average Purchase Amount by Shipping Type', fontsize=16)
bar_ax.set_xlabel('Average Purchase Amount (USD)', fontsize=12)
bar_ax.set_ylabel('Shipping Type', fontsize=12)
for row, amount in enumerate(a.values):
    bar_ax.text(amount + 0.2, row, f"${amount:.2f}", va='center', fontsize=10)
plt.tight_layout()
plt.show()
# Pairwise correlation between the four numeric columns.
correlation_data = df[['Age', 'Purchase Amount (USD)', 'Review Rating', 'Previous Purchases']]
correlation_matrix = correlation_data.corr()
correlation_matrix
| Age | Purchase Amount (USD) | Review Rating | Previous Purchases | |
|---|---|---|---|---|
| Age | 1.000000 | -0.010424 | -0.021949 | 0.040445 |
| Purchase Amount (USD) | -0.010424 | 1.000000 | 0.030776 | 0.008063 |
| Review Rating | -0.021949 | 0.030776 | 1.000000 | 0.004229 |
| Previous Purchases | 0.040445 | 0.008063 | 0.004229 | 1.000000 |
# Display the correlation matrix again before plotting it.
correlation_matrix
| Age | Purchase Amount (USD) | Review Rating | Previous Purchases | |
|---|---|---|---|---|
| Age | 1.000000 | -0.010424 | -0.021949 | 0.040445 |
| Purchase Amount (USD) | -0.010424 | 1.000000 | 0.030776 | 0.008063 |
| Review Rating | -0.021949 | 0.030776 | 1.000000 | 0.004229 |
| Previous Purchases | 0.040445 | 0.008063 | 0.004229 | 1.000000 |
# Annotated heatmap of the correlation matrix (values shown to two decimals).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
plt.show()
17. Season and Frequency Relationship:¶
• Does the frequency of purchases vary across seasons?
# Number of purchases recorded in each season, most common first.
# Note: this rebinds `x`, which previously held the shipping-type counts.
x = df['Season'].value_counts()
x
Season Spring 999 Fall 975 Winter 971 Summer 955 Name: count, dtype: int64
# Bar chart of purchases per season, with the count written above each bar.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=x.index, y=x.values, palette='viridis', edgecolor='black', width=0.5, ax=ax)
ax.set_title('Frequency of Purchases Across Seasons', fontsize=16)
ax.set_xlabel('Season', fontsize=12)
ax.set_ylabel('Number of Purchases', fontsize=12)
for position, purchases in enumerate(x.values):
    ax.text(position, purchases + 5, str(purchases), ha='center', fontsize=10)
plt.tight_layout()
plt.show()
# Pie chart of each season's share of purchases.
plt.figure(figsize=(6, 6))
pie_ax = x.plot.pie(autopct='%1.1f%%', startangle=140)
pie_ax.set_title('Frequency of Purchases Across Seasons', fontsize=14)
# pandas labels the y-axis with the Series name; blank it out.
pie_ax.set_ylabel('')
plt.tight_layout()
plt.show()
18. Category and Payment Methods:¶
• Are certain payment methods more common for specific product categories?
# Purchases per (Category, Payment Method): categories as rows, payment
# methods as columns.
category_payment_counts = df.groupby(['Category', 'Payment Method']).size().unstack()
category_payment_counts
| Payment Method | Bank Transfer | Cash | Credit Card | Debit Card | PayPal | Venmo |
|---|---|---|---|---|---|---|
| Category | ||||||
| Accessories | 198 | 200 | 245 | 195 | 199 | 203 |
| Clothing | 291 | 281 | 319 | 286 | 274 | 286 |
| Footwear | 98 | 105 | 84 | 91 | 113 | 108 |
| Outerwear | 45 | 62 | 48 | 61 | 52 | 56 |
# Convert counts to row percentages: each category's row is divided by its
# own total, so every row sums to 100.
category_payment_percentage = category_payment_counts.div(category_payment_counts.sum(axis=1), axis=0) * 100
category_payment_percentage
| Payment Method | Bank Transfer | Cash | Credit Card | Debit Card | PayPal | Venmo |
|---|---|---|---|---|---|---|
| Category | ||||||
| Accessories | 15.967742 | 16.129032 | 19.758065 | 15.725806 | 16.048387 | 16.370968 |
| Clothing | 16.753022 | 16.177317 | 18.364997 | 16.465170 | 15.774324 | 16.465170 |
| Footwear | 16.360601 | 17.529215 | 14.023372 | 15.191987 | 18.864775 | 18.030050 |
| Outerwear | 13.888889 | 19.135802 | 14.814815 | 18.827160 | 16.049383 | 17.283951 |
# Heatmap of the within-category payment-method percentages.
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    category_payment_percentage,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Payment Method Preferences by Category (Percentage)', fontsize=16)
ax.set_xlabel('Payment Method', fontsize=12)
ax.set_ylabel('Category', fontsize=12)
plt.tight_layout()
plt.show()
# Grouped bars: one cluster per category, one bar per payment method, each
# labelled with its count.
ax = category_payment_counts.plot.bar(
    figsize=(12, 8),
    stacked=False,
    colormap='tab20',
    edgecolor='black',
)
ax.set_title('Payment Method Preferences by Category', fontsize=16)
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Number of Purchases', fontsize=12)
ax.legend(title='Payment Method', fontsize=10, title_fontsize=12)
plt.xticks(rotation=45, fontsize=10)
plt.tight_layout()
for bars in ax.containers:
    ax.bar_label(bars, fmt='%d', label_type='edge', fontsize=9, padding=4)
plt.show()
import matplotlib.pyplot as plt
# Stacked bars: one bar per category, segmented by payment method, with each
# segment's count printed at its centre.
# DataFrame.plot(figsize=...) creates its own figure, so no plt.figure() call
# precedes it; the original one only left behind an empty
# "<Figure size 1600x1000 with 0 Axes>".
ax = category_payment_counts.plot(kind='bar', stacked=True, figsize=(16, 10), colormap='tab20')
plt.title('Payment Method Preferences by Category')
plt.xlabel('Category')
plt.ylabel('Number of Purchases')
# Horizontal tick labels (the original rotation=360 is a full turn, i.e. 0).
plt.xticks(rotation=0)
plt.legend(title='Payment Method')
for container in ax.containers:
    ax.bar_label(container, label_type='center', fmt='%d')
plt.show()
<Figure size 1600x1000 with 0 Axes>
19. Gender vs. Preferences:¶
• Do preferences for size, color, or category differ by gender?
# Purchases per (Gender, Size): genders as rows, sizes as columns.
size_counts = df.groupby(['Gender', 'Size']).size().unstack()
size_counts
| Size | L | M | S | XL |
|---|---|---|---|---|
| Gender | ||||
| Female | 337 | 590 | 187 | 134 |
| Male | 716 | 1165 | 476 | 295 |
# Row percentages: the share of each size within a gender, which makes the
# two genders comparable despite their different totals.
size_counts_percent = size_counts.div(size_counts.sum(axis=1), axis=0) * 100
size_counts_percent
| Size | L | M | S | XL |
|---|---|---|---|---|
| Gender | ||||
| Female | 27.003205 | 47.275641 | 14.983974 | 10.737179 |
| Male | 26.998492 | 43.929110 | 17.948718 | 11.123680 |
# Long-format size counts, drawn as grouped bars (one bar per size within each
# gender) with the count written above every bar.
size_melted = pd.melt(size_counts.reset_index(), id_vars='Gender', var_name='Size', value_name='Count')
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=size_melted, x='Gender', y='Count', hue='Size', palette='viridis', edgecolor='black', ax=ax)
ax.set_title('Size Preferences by Gender', fontsize=16)
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.legend(title='Size', fontsize=10, title_fontsize=12)
plt.tight_layout()
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, bar_height + 15,
            '{:1.0f}'.format(bar_height), ha='center', va='center', fontsize=10)
plt.show()
# Purchases per (Gender, Color): genders as rows, colors as columns.
color_counts = df.groupby(['Gender', 'Color']).size().unstack()
color_counts
| Color | Beige | Black | Blue | Brown | Charcoal | Cyan | Gold | Gray | Green | Indigo | ... | Peach | Pink | Purple | Red | Silver | Teal | Turquoise | Violet | White | Yellow |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Gender | |||||||||||||||||||||
| Female | 40 | 54 | 52 | 46 | 50 | 47 | 41 | 53 | 56 | 45 | ... | 48 | 58 | 42 | 44 | 52 | 53 | 40 | 53 | 45 | 60 |
| Male | 107 | 113 | 100 | 95 | 103 | 119 | 97 | 106 | 113 | 102 | ... | 101 | 95 | 109 | 104 | 121 | 119 | 105 | 113 | 97 | 114 |
2 rows × 25 columns
# Row percentages: the share of each color within a gender.
color_percent = color_counts.div(color_counts.sum(axis=1), axis=0) * 100
color_percent
| Color | Beige | Black | Blue | Brown | Charcoal | Cyan | Gold | Gray | Green | Indigo | ... | Peach | Pink | Purple | Red | Silver | Teal | Turquoise | Violet | White | Yellow |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Gender | |||||||||||||||||||||
| Female | 3.205128 | 4.326923 | 4.166667 | 3.685897 | 4.006410 | 3.766026 | 3.285256 | 4.246795 | 4.487179 | 3.605769 | ... | 3.846154 | 4.647436 | 3.365385 | 3.525641 | 4.166667 | 4.246795 | 3.205128 | 4.246795 | 3.605769 | 4.807692 |
| Male | 4.034691 | 4.260935 | 3.770739 | 3.582202 | 3.883861 | 4.487179 | 3.657617 | 3.996983 | 4.260935 | 3.846154 | ... | 3.808446 | 3.582202 | 4.110106 | 3.921569 | 4.562594 | 4.487179 | 3.959276 | 4.260935 | 3.657617 | 4.298643 |
2 rows × 25 columns
# Long-format color counts, drawn as grouped bars (one bar per color within
# each gender) with the count written above every bar.
color_melted = color_counts.reset_index().melt(id_vars='Gender', var_name='Color', value_name='Count')
plt.figure(figsize=(16, 10))
sns.barplot(data=color_melted, x='Gender', y='Count', hue='Color', palette='viridis', edgecolor='black')
plt.title('Color Preferences by Gender', fontsize=16)
# The x-axis carries Gender (Color is the hue); the original label said 'Color'.
plt.xlabel('Gender', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.legend(title='Color', bbox_to_anchor=(0,1), fontsize=10, title_fontsize=12)
plt.tight_layout()
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width() / 2, p.get_height() + 1,
             '{:1.0f}'.format(p.get_height()), ha='center', va='center', fontsize=10)
plt.show()
# Purchases per (Gender, Category): genders as rows, categories as columns.
category_counts = df.groupby(['Gender', 'Category']).size().unstack()
category_counts
| Category | Accessories | Clothing | Footwear | Outerwear |
|---|---|---|---|---|
| Gender | ||||
| Female | 392 | 556 | 199 | 101 |
| Male | 848 | 1181 | 400 | 223 |
# Row percentages: the share of each category within a gender.
category_percent = category_counts.div(category_counts.sum(axis=1), axis=0) * 100
category_percent
| Category | Accessories | Clothing | Footwear | Outerwear |
|---|---|---|---|---|
| Gender | ||||
| Female | 31.410256 | 44.551282 | 15.945513 | 8.092949 |
| Male | 31.975867 | 44.532428 | 15.082956 | 8.408748 |
# Long-format category counts, drawn as grouped bars (one bar per category
# within each gender) with the count written above every bar.
category_melted = pd.melt(category_counts.reset_index(), id_vars='Gender', var_name='Category', value_name='Count')
plt.figure(figsize=(12, 8))
barplot = sns.barplot(data=category_melted, x='Gender', y='Count', hue='Category', palette='viridis')
barplot.set_title('Category Preferences by Gender', fontsize=16)
barplot.set_xlabel('Gender', fontsize=12)
barplot.set_ylabel('Count', fontsize=12)
barplot.legend(title='Category', fontsize=10, title_fontsize=12)
plt.tight_layout()
for bar in barplot.patches:
    bar_height = bar.get_height()
    barplot.text(bar.get_x() + bar.get_width() / 2, bar_height + 10,
                 '{:1.0f}'.format(bar_height), ha='center', va='center', fontsize=14)
plt.show()
# Names of the numeric columns, and the matching sub-frame of df.
X = ['Age', 'Purchase Amount (USD)', 'Previous Purchases', 'Review Rating']
data = df[X]
data
| Age | Purchase Amount (USD) | Previous Purchases | Review Rating | |
|---|---|---|---|---|
| 0 | 55 | 53 | 14 | 3.1 |
| 1 | 19 | 64 | 2 | 3.1 |
| 2 | 50 | 73 | 23 | 3.1 |
| 3 | 21 | 90 | 49 | 3.5 |
| 4 | 45 | 49 | 31 | 2.7 |
| ... | ... | ... | ... | ... |
| 3895 | 40 | 28 | 32 | 4.2 |
| 3896 | 52 | 49 | 41 | 4.5 |
| 3897 | 46 | 33 | 24 | 2.9 |
| 3898 | 44 | 77 | 24 | 3.8 |
| 3899 | 52 | 81 | 33 | 3.1 |
3900 rows × 4 columns
# Pairwise correlation of the numeric sub-frame (rebinds correlation_matrix).
correlation_matrix = data.corr()
correlation_matrix
| Age | Purchase Amount (USD) | Previous Purchases | Review Rating | |
|---|---|---|---|---|
| Age | 1.000000 | -0.010424 | 0.040445 | -0.021949 |
| Purchase Amount (USD) | -0.010424 | 1.000000 | 0.008063 | 0.030776 |
| Previous Purchases | 0.040445 | 0.008063 | 1.000000 | 0.004229 |
| Review Rating | -0.021949 | 0.030776 | 0.004229 | 1.000000 |
# Annotated correlation heatmap with readable, rotated tick labels.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Numeric Variables', fontsize=16)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12, rotation=0)
plt.tight_layout()
plt.show()
21. Pairplots:¶
• Use pairplots to explore pairwise relationships between numeric variables.
# Scatter-matrix of the numeric columns, colored by gender, with KDE curves
# on the diagonal.
# `vars` expects a list of column names. Pass the name list X rather than the
# DataFrame `data`, which only worked because iterating a DataFrame yields its
# column labels.
sns.pairplot(df, vars=X, hue='Gender', diag_kind='kde', palette='Set2')
plt.suptitle('Pairplot of Numeric Variables by Gender', y=1.02, fontsize=16)
plt.show()
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Divya\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
with pd.option_context('mode.use_inf_as_na', True):
22. Category and Purchase Amount¶
Analysis: • Visualize the distribution of purchase amounts for each category using boxplots or violin plots.
# One box per category showing the spread of purchase amounts.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Category', y='Purchase Amount (USD)', palette='Set2', ax=ax)
ax.set_title('Distribution of Purchase Amounts by Category', fontsize=16)
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Purchase Amount', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Median purchase amount per category.
medians = df.groupby('Category')['Purchase Amount (USD)'].median()
medians
Category Accessories 60.0 Clothing 60.0 Footwear 60.0 Outerwear 54.5 Name: Purchase Amount (USD), dtype: float64
# Violin plot of purchase amounts per category with quartile lines inside;
# each category's median value is printed at its median height.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(
    data=df,
    x='Category',
    y='Purchase Amount (USD)',
    palette='Set2',
    inner='quartile',
    ax=ax,
)
ax.set_title('Distribution of Purchase Amounts by Category (Violin Plot)', fontsize=16)
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Purchase Amount', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
for position, median_value in enumerate(medians):
    ax.text(position, median_value, f'{median_value:.2f}', ha='center', va='center', fontsize=10)
plt.show()